{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "data=pd.read_csv('F:/course/Logistic regression/credit_training.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# seems like the first column is not needed\n",
    "data=data.drop(['Unnamed: 0'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SeriousDlqin2yrs</th>\n",
       "      <th>RevolvingUtilizationOfUnsecuredLines</th>\n",
       "      <th>age</th>\n",
       "      <th>NumberOfTime30to59DaysPastDueNotWorse</th>\n",
       "      <th>DebtRatio</th>\n",
       "      <th>MonthlyIncome</th>\n",
       "      <th>NumberOfOpenCreditLinesAndLoans</th>\n",
       "      <th>NumberOfTimes90DaysLate</th>\n",
       "      <th>NumberRealEstateLoansOrLines</th>\n",
       "      <th>NumberOfTime60to89DaysPastDueNotWorse</th>\n",
       "      <th>NumberOfDependents</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>150000.000000</td>\n",
       "      <td>150000.000000</td>\n",
       "      <td>150000.000000</td>\n",
       "      <td>150000.000000</td>\n",
       "      <td>150000.000000</td>\n",
       "      <td>1.202690e+05</td>\n",
       "      <td>150000.000000</td>\n",
       "      <td>150000.000000</td>\n",
       "      <td>150000.000000</td>\n",
       "      <td>150000.000000</td>\n",
       "      <td>146076.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.066840</td>\n",
       "      <td>6.048438</td>\n",
       "      <td>52.295207</td>\n",
       "      <td>0.421033</td>\n",
       "      <td>353.005076</td>\n",
       "      <td>6.670221e+03</td>\n",
       "      <td>8.452760</td>\n",
       "      <td>0.265973</td>\n",
       "      <td>1.018240</td>\n",
       "      <td>0.240387</td>\n",
       "      <td>0.757222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.249746</td>\n",
       "      <td>249.755371</td>\n",
       "      <td>14.771866</td>\n",
       "      <td>4.192781</td>\n",
       "      <td>2037.818523</td>\n",
       "      <td>1.438467e+04</td>\n",
       "      <td>5.145951</td>\n",
       "      <td>4.169304</td>\n",
       "      <td>1.129771</td>\n",
       "      <td>4.155179</td>\n",
       "      <td>1.115086</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.029867</td>\n",
       "      <td>41.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.175074</td>\n",
       "      <td>3.400000e+03</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.154181</td>\n",
       "      <td>52.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.366508</td>\n",
       "      <td>5.400000e+03</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.559046</td>\n",
       "      <td>63.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.868254</td>\n",
       "      <td>8.249000e+03</td>\n",
       "      <td>11.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>50708.000000</td>\n",
       "      <td>109.000000</td>\n",
       "      <td>98.000000</td>\n",
       "      <td>329664.000000</td>\n",
       "      <td>3.008750e+06</td>\n",
       "      <td>58.000000</td>\n",
       "      <td>98.000000</td>\n",
       "      <td>54.000000</td>\n",
       "      <td>98.000000</td>\n",
       "      <td>20.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       SeriousDlqin2yrs  RevolvingUtilizationOfUnsecuredLines            age  \\\n",
       "count     150000.000000                         150000.000000  150000.000000   \n",
       "mean           0.066840                              6.048438      52.295207   \n",
       "std            0.249746                            249.755371      14.771866   \n",
       "min            0.000000                              0.000000       0.000000   \n",
       "25%            0.000000                              0.029867      41.000000   \n",
       "50%            0.000000                              0.154181      52.000000   \n",
       "75%            0.000000                              0.559046      63.000000   \n",
       "max            1.000000                          50708.000000     109.000000   \n",
       "\n",
       "       NumberOfTime30to59DaysPastDueNotWorse      DebtRatio  MonthlyIncome  \\\n",
       "count                          150000.000000  150000.000000   1.202690e+05   \n",
       "mean                                0.421033     353.005076   6.670221e+03   \n",
       "std                                 4.192781    2037.818523   1.438467e+04   \n",
       "min                                 0.000000       0.000000   0.000000e+00   \n",
       "25%                                 0.000000       0.175074   3.400000e+03   \n",
       "50%                                 0.000000       0.366508   5.400000e+03   \n",
       "75%                                 0.000000       0.868254   8.249000e+03   \n",
       "max                                98.000000  329664.000000   3.008750e+06   \n",
       "\n",
       "       NumberOfOpenCreditLinesAndLoans  NumberOfTimes90DaysLate  \\\n",
       "count                    150000.000000            150000.000000   \n",
       "mean                          8.452760                 0.265973   \n",
       "std                           5.145951                 4.169304   \n",
       "min                           0.000000                 0.000000   \n",
       "25%                           5.000000                 0.000000   \n",
       "50%                           8.000000                 0.000000   \n",
       "75%                          11.000000                 0.000000   \n",
       "max                          58.000000                98.000000   \n",
       "\n",
       "       NumberRealEstateLoansOrLines  NumberOfTime60to89DaysPastDueNotWorse  \\\n",
       "count                 150000.000000                          150000.000000   \n",
       "mean                       1.018240                               0.240387   \n",
       "std                        1.129771                               4.155179   \n",
       "min                        0.000000                               0.000000   \n",
       "25%                        0.000000                               0.000000   \n",
       "50%                        1.000000                               0.000000   \n",
       "75%                        2.000000                               0.000000   \n",
       "max                       54.000000                              98.000000   \n",
       "\n",
       "       NumberOfDependents  \n",
       "count       146076.000000  \n",
       "mean             0.757222  \n",
       "std              1.115086  \n",
       "min              0.000000  \n",
       "25%              0.000000  \n",
       "50%              0.000000  \n",
       "75%              1.000000  \n",
       "max             20.000000  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Seems like monthlyincome & numberofdependents have some missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# There are multiple ways in which we can fix missing values\n",
    "# for now, lets just impute missing values with median of that column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data['MonthlyIncome']=data['MonthlyIncome'].fillna(value=data['MonthlyIncome'].median())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data['NumberOfDependents']=data['NumberOfDependents'].fillna(value=data['NumberOfDependents'].median())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Admin\\Anaconda2\\lib\\site-packages\\ipykernel\\__main__.py:2: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n",
      "  from ipykernel import kernelapp as app\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "sample=np.random.randint(0,data.shape[0],np.around(data.shape[0]*0.8))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train=data.loc[sample]\n",
    "test=data.drop(data.index[sample])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X = data.drop('SeriousDlqin2yrs',axis=1)\n",
    "y = data['SeriousDlqin2yrs']\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "rfc = RandomForestClassifier(n_estimators=100,max_depth=5,min_samples_leaf=100,random_state=10)\n",
    "rfc.fit(X_train, y_train)\n",
    "rfc_pred=rfc.predict_proba(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.86111702216953034"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "roc_auc_score(y_test, rfc_pred[:,1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
